from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import pacf,acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from itertools import product
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
df = pd.read_csv(r'C:\Users\muhammad.usman38\Downloads\83163_1980_1_1_2020.csv')
df.head()
| COOPID | year | month | day | meanTemp | |
|---|---|---|---|---|---|
| 0 | 83163 | 1980 | 1 | 1 | 61.5 |
| 1 | 83163 | 1980 | 1 | 2 | 55.0 |
| 2 | 83163 | 1980 | 1 | 3 | 60.5 |
| 3 | 83163 | 1980 | 1 | 4 | 66.5 |
| 4 | 83163 | 1980 | 1 | 5 | 57.5 |
df.dtypes
COOPID int64 year int64 month int64 day int64 meanTemp float64 dtype: object
df['Calculated_Date'] = df[['year', 'month', 'day']].apply(lambda x: '{}-{}-{}'.format(x[0], x[1], x[2]), axis=1)
df['Calculated_Date'].head()
0 1980-1-1 1 1980-1-2 2 1980-1-3 3 1980-1-4 4 1980-1-5 Name: Calculated_Date, dtype: object
df['Calculated_Date'] = pd.to_datetime(df['Calculated_Date'])
df['meanTemp'] = pd.to_numeric(df['meanTemp'])
df = df.drop(labels=['COOPID','year','month','day'], axis=1)
df.head()
| meanTemp | Calculated_Date | |
|---|---|---|
| 0 | 61.5 | 1980-01-01 |
| 1 | 55.0 | 1980-01-02 |
| 2 | 60.5 | 1980-01-03 |
| 3 | 66.5 | 1980-01-04 |
| 4 | 57.5 | 1980-01-05 |
df.dtypes
meanTemp float64 Calculated_Date datetime64[ns] dtype: object
import plotly.graph_objects as go
import pandas as pd
# Create figure
fig = go.Figure()
fig.add_trace(
go.Scatter(x =df['Calculated_Date'],y = df['meanTemp'])
# # Set title
# fig.update_layout(
# title_text="Time series with range slider and selectors"
)
# Add range slider
fig.update_layout(
xaxis=dict(
rangeselector=dict(
buttons=list([
dict(count=1,
label="1m",
step="month",
stepmode="backward"),
dict(count=6,
label="6m",
step="month",
stepmode="backward"),
dict(count=1,
label="YTD",
step="year",
stepmode="todate"),
dict(count=1,
label="1y",
step="year",
stepmode="backward"),
dict(step="all")
])
),
rangeslider=dict(
visible=True
),
type="date"
)
)
fig.show()
plot_pacf(df['meanTemp']);
plot_acf(df['meanTemp']);
from statsmodels.tsa.stattools import pacf,acf
import plotly.graph_objects as go
df_pacf = pacf(df['meanTemp'], nlags=300)
fig = go.Figure()
fig.add_trace(go.Scatter(
x= np.arange(len(df_pacf)),
y= df_pacf,
name= 'PACF',
))
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(
title="Partial Autocorrelation",
xaxis_title="Lag",
yaxis_title="Partial Autocorrelation",
height=500,
)
fig.show()
df_acf = acf(df['meanTemp'], nlags=300)
fig = go.Figure()
fig.add_trace(go.Scatter(
x= np.arange(len(df_acf)),
y= df_acf,
name= 'ACF',
))
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(
title="Autocorrelation",
xaxis_title="Lag",
yaxis_title="Autocorrelation",
height=500,
)
fig.show()
ad_fuller_result = adfuller(df['meanTemp'])
print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')
ADF Statistic: -8.521928239248195 p-value: 1.0947585212702767e-13
plt.hist(df['meanTemp'])
plt.show()
import plotly.express as px
fig = px.histogram(df, x=df['meanTemp'])
fig.update_xaxes(rangeslider_visible=True)
fig.show()
def optimize_SARIMA(parameters_list, d, D, s, exog):
results = []
for param in tqdm_notebook(parameters_list):
try:
model = SARIMAX(exog, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], s)).fit(disp=-1)
except:
continue
aic = model.aic
results.append([param, aic])
result_df = pd.DataFrame(results)
result_df.columns = ['(p,q)x(P,Q)', 'AIC']
#Sort in ascending order, lower AIC is better
result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
return result_df
p = range(0, 3, 1)
d = 1
q = range(0, 3, 1)
P = range(0, 3, 1)
D = 1
Q = range(0, 3, 1)
s = 2
parameters = product(p, q, P, Q)
parameters_list = list(parameters)
print(len(parameters_list))
81
result_df = optimize_SARIMA(parameters_list, 1, 1, 3, df['meanTemp'])
result_df
0%| | 0/81 [00:00<?, ?it/s]
| (p,q)x(P,Q) | AIC | |
|---|---|---|
| 0 | (1, 2, 0, 2) | 73260.017704 |
| 1 | (1, 2, 1, 1) | 73260.341933 |
| 2 | (1, 2, 2, 1) | 73260.447554 |
| 3 | (2, 2, 0, 2) | 73261.427256 |
| 4 | (2, 2, 2, 1) | 73262.238920 |
| ... | ... | ... |
| 76 | (2, 0, 0, 0) | 86281.622303 |
| 77 | (1, 1, 0, 0) | 87183.400934 |
| 78 | (0, 1, 0, 0) | 87184.345989 |
| 79 | (1, 0, 0, 0) | 87316.712295 |
| 80 | (0, 0, 0, 0) | 87641.338987 |
81 rows × 2 columns
result_df['AIC'].idxmin()
0
result_df[0:1]
| (p,q)x(P,Q) | AIC | |
|---|---|---|
| 0 | (1, 2, 0, 2) | 73260.017704 |
best_model = SARIMAX(df['meanTemp'], order=(1, 1, 2), seasonal_order=(0, 1, 2, 12)).fit(dis=-1)
print(best_model.summary())
SARIMAX Results
==========================================================================================
Dep. Variable: meanTemp No. Observations: 14530
Model: SARIMAX(1, 1, 2)x(0, 1, 2, 12) Log Likelihood -36633.936
Date: Tue, 02 Aug 2022 AIC 73279.872
Time: 22:09:06 BIC 73325.371
Sample: 0 HQIC 73294.992
- 14530
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.4739 0.011 43.680 0.000 0.453 0.495
ma.L1 -0.5642 0.011 -52.059 0.000 -0.585 -0.543
ma.L2 -0.3079 0.008 -39.839 0.000 -0.323 -0.293
ma.S.L12 -0.9904 0.009 -104.596 0.000 -1.009 -0.972
ma.S.L24 -0.0094 0.007 -1.395 0.163 -0.023 0.004
sigma2 9.0558 0.082 110.877 0.000 8.896 9.216
===================================================================================
Ljung-Box (L1) (Q): 0.22 Jarque-Bera (JB): 17882.21
Prob(Q): 0.64 Prob(JB): 0.00
Heteroskedasticity (H): 0.82 Skew: -1.10
Prob(H) (two-sided): 0.00 Kurtosis: 7.97
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
best_model.plot_diagnostics(figsize=(18, 8))
plt.show()
df['arima_model'] = best_model.fittedvalues
df['arima_model'][0:1] = np.NaN
forecast = best_model.predict(start=df.shape[0], end=df.shape[0] + 180)
forecast = df['arima_model'].append(forecast)
df.to_csv(r'C:\Users\muhammad.usman38\Desktop\test.csv')
df
| meanTemp | Calculated_Date | arima_model | |
|---|---|---|---|
| 0 | 61.5 | 1980-01-01 | NaN |
| 1 | 55.0 | 1980-01-02 | 61.500009 |
| 2 | 60.5 | 1980-01-03 | 54.999833 |
| 3 | 66.5 | 1980-01-04 | 60.499959 |
| 4 | 57.5 | 1980-01-05 | 66.499952 |
| ... | ... | ... | ... |
| 14525 | 77.0 | 2019-12-28 | 74.753703 |
| 14526 | 77.0 | 2019-12-29 | 75.336646 |
| 14527 | 77.5 | 2019-12-30 | 75.472485 |
| 14528 | 74.0 | 2019-12-31 | 75.818053 |
| 14529 | 69.0 | 2020-01-01 | 72.812581 |
14530 rows × 3 columns
plt.figure(figsize=(15, 7.5))
plt.plot(forecast, color='r', label='model')
plt.plot(df['meanTemp'], label='actual')
plt.legend()
plt.show()
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=df["Calculated_Date"], y=df["meanTemp"], name="Actual", mode="lines"))
fig.add_trace(go.Scatter(x=df["Calculated_Date"], y=df["arima_model"], name="Forecast", mode="lines"))
fig.update_layout(
title="Model reults", xaxis_title="Date", yaxis_title="Temp"
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()